Explore the data contained in the EPA Hazardous Air Pollutants dataset. This dataset contains daily summaries of hazardous air pollutant measurements from 1990 to 2017.
Download the data from Kaggle.
In [1]:
import os

import pandas as pd
from pivottablejs import pivot_ui
In [2]:
# Location of the EPA HAP daily summary CSV.
# Set the EPA_HAP_CSV environment variable to point at your copy of the file,
# or change the fallback path below to where the file lives on your machine.
csv_file = os.environ.get(
    "EPA_HAP_CSV",
    "/Users/robert.dempsey/Downloads/epa_hap_daily_summary.csv",
)
In [3]:
# Load the CSV into a dataframe, using the first column as the index and
# parsing it as dates.
# pd.DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
# read_csv with index_col=0 and parse_dates=True reproduces its defaults.
hap_df = pd.read_csv(csv_file, index_col=0, parse_dates=True)
In [4]:
# Report the dimensions of the dataframe: number of rows, then number of columns
rows_cols = hap_df.shape
row_count, col_count = rows_cols
print("Rows: {}".format(row_count))
print("Columns: {}".format(col_count))
In [5]:
# List each column of the dataframe alongside its pandas dtype.
# Left as a bare last expression so the notebook renders it as the cell output.
hap_df.dtypes
Out[5]:
In [6]:
# Show summary statistics for the dataframe.
# NOTE(review): with default arguments describe() summarizes only the numeric
# columns of a mixed-dtype frame — confirm that is the intended view.
hap_df.describe()
Out[6]:
In [7]:
# Peek at the end of the dataframe (final five rows)
hap_df.tail(n=5)
Out[7]:
In [8]:
# Show every column name and value of the final row as a list holding one dict
last_row = hap_df.tail(1)
last_row.to_dict(orient='records')
Out[8]:
In [9]:
# Check whether any column contains missing values; if so, report how many rows
# are affected and replace every missing value with 0.
# The null mask is computed once and reused for both the check and the row count.
null_mask = hap_df.isnull()
null_values_present = null_mask.values.any()
if null_values_present:
    # any(axis=1) flags rows with at least one missing value without having to
    # transpose the whole dataframe.
    nan_rows = hap_df[null_mask.any(axis=1)]
    print("Rows with empty values: {}".format(len(nan_rows)))
    print("Filling empty values")
    # NOTE(review): fillna(0) also writes 0 into any non-numeric columns that
    # have missing values — confirm that is intended.
    hap_df = hap_df.fillna(0)
    print("Empty values filled")
In [10]:
# Build the PivotTable.js input from the final 60 rows of the dataframe.
# reset_index moves the existing index into a regular column and gives the
# frame a fresh numerical index.
hap_pivot_df = hap_df.tail(60).reset_index(level=0)
# Render the interactive pivot table as the cell output
pivot_ui(hap_pivot_df)
Out[10]:
In [ ]: